import cv2
import pandas as pd
import os.path
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
from sklearn.cluster import KMeans
# Local directory that mirrors the remote image bucket.
imgdir = "images2"
# 'Unnamed: 0' is dropped right after loading (presumably a saved index
# column -- confirm against the CSV).
df = pd.read_csv('df_nonLinear.csv').drop(columns='Unnamed: 0')
# Keep only rows that reference an image.  The explicit copy makes the column
# assignment below write to an independent frame rather than to a slice of
# the loaded one, which avoids pandas' SettingWithCopyWarning.
df = df[df.Artwork_Image.notna()].copy()
# replace url_prefix with imgdir (this is where we downloaded the images)
url_prefix = 'http://artinfo-images-350.s3.amazonaws.com'
df['Artwork_Image_Path'] = df.Artwork_Image.apply(
    lambda url: url.replace(url_prefix, imgdir))
# keep the rows if we have images; isfile (rather than exists) so that a path
# resolving to a directory is not mistaken for a downloaded image
df = df[df.Artwork_Image_Path.map(os.path.isfile)]
df.info()
def _mean(im):
    """Return the per-channel means of a 3-channel image array.

    ``im`` is indexed as ``im[:, :, channel]``; the result is a list holding
    the mean of channel 0, 1 and 2, in that order.
    """
    return [im[:, :, channel].mean() for channel in range(3)]
# opencv read in BGR; every target colour space is reached from BGR with a
# single cvtColor call, and the mean is taken for every channel.
_COLOR_CODES = {
    "rgb": cv2.COLOR_BGR2RGB,
    "lab": cv2.COLOR_BGR2LAB,
    "hsv": cv2.COLOR_BGR2HSV,
}


def _color_means(imgpath):
    """Return ``{space: [imgpath, mean0, mean1, mean2]}`` for one image.

    The file is decoded once and then converted to each colour space, instead
    of being re-read from disk for every colour space.

    Raises:
        IOError: if OpenCV cannot decode the file (``cv2.imread`` reports
            that by returning ``None`` rather than by raising).
    """
    bgr = cv2.imread(imgpath)
    if bgr is None:
        raise IOError("cannot read image: {}".format(imgpath))
    return {space: [imgpath] + _mean(cv2.cvtColor(bgr, code))
            for space, code in _COLOR_CODES.items()}


# One entry per row of df, in df's row order.
_means_by_image = [_color_means(imgpath) for imgpath in df.Artwork_Image_Path]


def _means_frame(space, channels):
    """Build the per-image mean table for ``space``, indexed like ``df``.

    ``channels`` is a space-separated string naming the three mean columns.
    """
    return pd.DataFrame(
        [means[space] for means in _means_by_image],
        index=df.index,
        columns=["Artwork_Image_Path"] + channels.split())


df_rgb = _means_frame("rgb", "r g b")
# LAB
df_lab = _means_frame("lab", "l a b")
# HSV
df_hsv = _means_frame("hsv", "h s v")
# Summary statistics side by side; 'b' exists in both the RGB and the LAB
# table, so the LAB one is suffixed to keep the join unambiguous.
df_rgb.describe().join(df_lab.describe(), rsuffix="_lab").join(df_hsv.describe()).round(2)
# compute k-mean
# Number of clusters fitted in every colour space; also sizes the index of
# the cluster-centre table below.
N_CLUSTERS = 8
kmeans_rgb = KMeans(n_clusters=N_CLUSTERS, random_state=0).fit(df_rgb[['r', 'g', 'b']])
kmeans_lab = KMeans(n_clusters=N_CLUSTERS, random_state=0).fit(df_lab[['l', 'a', 'b']])
kmeans_hsv = KMeans(n_clusters=N_CLUSTERS, random_state=0).fit(df_hsv[['h', 's', 'v']])
# Iterations used vs. allowed; "converged" here means the fit stopped before
# reaching max_iter.  (Bare expressions like this one only display a result
# when run as a notebook cell.)
pd.DataFrame.from_records({
    name: [km.n_iter_, km.max_iter, km.n_iter_ < km.max_iter]
    for name, km in (("RGB", kmeans_rgb), ("LAB", kmeans_lab), ("HSV", kmeans_hsv))},
    index=['n_iter', 'max_iter', 'converged']).T
# algo params
pd.DataFrame.from_records(
    [km.get_params() for km in (kmeans_hsv, kmeans_lab, kmeans_rgb)],
    index=["HSV", "LAB", "RGB"])
# center of each cluster, one tuple per cluster label.  The tuples are built
# as real lists: a lazy map object is not a sequence, and handing one over as
# column data relies on pandas materialising it.
pd.DataFrame.from_records({
    'HSV': [tuple(center) for center in kmeans_hsv.cluster_centers_.round(1)],
    'LAB': [tuple(center) for center in kmeans_lab.cluster_centers_.round(1)],
    'RGB': [tuple(center) for center in kmeans_rgb.cluster_centers_.round(1)]},
    index=pd.Index(range(N_CLUSTERS), name='label'))
# how many items in each cluster
def _uniq(arr):
    """Map every distinct value in ``arr`` to its number of occurrences."""
    values, occurrences = np.unique(arr, return_counts=True)
    return {value: n for value, n in zip(values, occurrences)}
# Cluster sizes: one row per colour space, one column per cluster label.
pd.DataFrame({
    name: _uniq(model.labels_)
    for name, model in (('HSV', kmeans_hsv), ('LAB', kmeans_lab), ('RGB', kmeans_rgb))
}).T
def _join(_df, _km, _cols):
    """Append each row's cluster label and cluster centre to ``_df``.

    ``_km`` supplies ``labels_`` (matched to the rows of ``_df`` by position)
    and ``cluster_centers_``.  ``_cols`` names the added columns: the label
    first, then one column per centre coordinate.  Returns the joined frame.
    """
    centers = _km.cluster_centers_
    # One record per sample: its label followed by that cluster's centre.
    records = ([label] + centers[label].tolist() for label in _km.labels_)
    assignments = pd.DataFrame.from_records(
        records, index=_df.index, columns=_cols)
    return _df.join(assignments)
# Attach the cluster label and centre coordinates to each per-image table.
df_rgb_km = _join(df_rgb, kmeans_rgb, ['cluster'] + ['center_' + ch for ch in 'rgb'])
df_hsv_km = _join(df_hsv, kmeans_hsv, ['cluster'] + ['center_' + ch for ch in 'hsv'])
df_lab_km = _join(df_lab, kmeans_lab, ['cluster'] + ['center_' + ch for ch in 'lab'])
# Preview the first rows of each table.
df_rgb_km.head()
df_hsv_km.head()
df_lab_km.head()
# add a prefix to column names
def _prefix_cols(_df, prefix):
    """Return ``_df`` with ``prefix`` prepended to its column names.

    The ``Artwork_Image_Path`` column keeps its name; ``_df`` itself is not
    modified.
    """
    renamed = {col: prefix + col
               for col in _df.columns
               if col != 'Artwork_Image_Path'}
    return _df.rename(columns=renamed)
def _cluster_cols(_km_df, prefix):
    """Return the prefixed columns of ``_km_df`` without the image path."""
    return _prefix_cols(_km_df, prefix).drop(columns='Artwork_Image_Path')


# The three cluster tables were built row by row from df and keep its index,
# so they are aligned on that index.  Merging on Artwork_Image_Path would
# multiply rows whenever the same path appears on more than one row.
# reset_index keeps the 0..n-1 index a column merge would have produced.
df_all = (df
          .join(_cluster_cols(df_rgb_km, 'rgb_'))
          .join(_cluster_cols(df_hsv_km, 'hsv_'))
          .join(_cluster_cols(df_lab_km, 'lab_'))
          .reset_index(drop=True))


def _clean_cell(value):
    """Round floats to two decimals and collapse multi-line strings."""
    if isinstance(value, float):
        # round float cols
        return round(value, 2)
    if isinstance(value, str):
        # replace newline
        return ' '.join(value.splitlines())
    return value


# DataFrame.applymap is deprecated in favour of DataFrame.map in newer
# pandas; look the method up on the class so that a data column that happens
# to be called 'map' cannot shadow it.
_elementwise = (pd.DataFrame.map if hasattr(pd.DataFrame, 'map')
                else pd.DataFrame.applymap)
_elementwise(df_all, _clean_cell).to_csv('artwork_img_kmean.csv', index=False)
df_all.info()
from PIL import Image
def _imshow(_df, _type, _label, _ax_title):
    """Show up to eight images from ``_df`` on a 2x4 grid of axes.

    Args:
        _df: frame with an ``Artwork_Image_Path`` column.  Rows are taken in
            positional order; rows beyond the eighth are ignored, and with
            fewer than eight rows the remaining cells stay blank.
        _type: colour-space name shown in the figure title.
        _label: cluster label shown in the figure title.
        _ax_title: callable mapping a row of ``_df`` to the first line of
            that image's title.
    """
    fig, axes = plt.subplots(2, 4, figsize=[12, 8])
    # Character-wise common prefix of all paths, stripped from the titles to
    # keep them short.
    path_prefix = os.path.commonprefix(_df.Artwork_Image_Path.tolist())
    n_rows = len(_df)
    for i, iax in enumerate(axes.flatten()):
        if i < n_rows:
            _row = _df.iloc[i]
            _imgpath = _row.Artwork_Image_Path
            # convert() returns an independent in-memory image, so the file
            # handle can be released right after the call.
            with Image.open(_imgpath) as img:
                iax.imshow(img.convert('RGB'))
            # Slice instead of str.replace so only the leading prefix is
            # removed, not later occurrences of the same text.
            iax.set_title("{}\n{}".format(
                _ax_title(_row), _imgpath[len(path_prefix):]))
        # Hide ticks and frame for image cells and blank cells alike.
        iax.axis("off")
    _ = fig.suptitle("{} cluster={}".format(_type, _label), x=0.5, y=0.95)
def _imshow_rgb(_df, _label):
    """Plot images of one RGB cluster.

    Each image is titled with its cluster and its mean r, g, b values, all
    converted with ``int``.
    """
    def _title(row):
        return "cluster={} [{} {} {}]".format(
            *(int(v) for v in (row.cluster, row.r, row.g, row.b)))

    _imshow(_df, "RGB", _label, _ax_title=_title)
def _imshow_hsv(_df, _label):
    """Plot images of one HSV cluster.

    Each image is titled with its cluster and its mean h, s, v values, all
    converted with ``int``.
    """
    def _title(row):
        return "cluster={} [{} {} {}]".format(
            *(int(v) for v in (row.cluster, row.h, row.s, row.v)))

    _imshow(_df, "HSV", _label, _ax_title=_title)
def _imshow_lab(_df, _label):
    """Plot images of one LAB cluster.

    Each image is titled with its cluster and its mean l, a, b values, all
    converted with ``int``.
    """
    def _title(row):
        return "cluster={} [{} {} {}]".format(
            *(int(v) for v in (row.cluster, row.l, row.a, row.b)))

    _imshow(_df, "LAB", _label, _ax_title=_title)
df_rgb_km_groups = df_rgb_km.groupby('cluster')
# One figure per RGB cluster, in label order.  A figure holds eight images,
# so at most eight are drawn; capping the sample at the cluster size keeps
# sample() from raising on clusters with fewer members than requested.
for _label, _members in df_rgb_km_groups:
    _imshow_rgb(_members.sample(min(len(_members), 8)), _label)
df_hsv_km_groups = df_hsv_km.groupby('cluster')
# One figure per HSV cluster, in label order.  A figure holds eight images,
# so at most eight are drawn; capping the sample at the cluster size keeps
# sample() from raising on clusters with fewer members than requested.
for _label, _members in df_hsv_km_groups:
    _imshow_hsv(_members.sample(min(len(_members), 8)), _label)
df_lab_km_groups = df_lab_km.groupby('cluster')
# One figure per LAB cluster, in label order.  A figure holds eight images,
# so at most eight are drawn; capping the sample at the cluster size keeps
# sample() from raising on clusters with fewer members than requested.
for _label, _members in df_lab_km_groups:
    _imshow_lab(_members.sample(min(len(_members), 8)), _label)